// Load the Census extract, discarding whatever is currently in memory.
use ../data/Census.dta, clear

// The extract is expected to hold only white males (racesing==1, sex==1);
// stop here if any other record is present.
assert racesing==1 & sex==1

// Cross-tabulate birth year against Census year (missing values shown).
tab birthyr year, m
// Keep ten birth cohorts per Census year: 1910-19 in 1960, 1920-29 in 1970,
// 1930-39 in 1980, i.e. year-50 <= birthyr <= year-41 (ages 40-49 by
// year of birth).
keep if inlist(year,1960,1970,1980) & inrange(birthyr,year-50,year-41)

// Birthplace: tabulate, require a code of at most 56 (born in the US),
// then remove codes 2 and 15 (Alaska and Hawaii, which have no CSL data).
tab bpl, m
assert bpl<=56
drop if inlist(bpl,2,15)

// Inspect the detailed education codes by Census year (missing values shown).
tab educd year, m
// Build yschl, a consistent years-of-schooling measure, from educd.
// Codes that match none of the rules below are copied to yschl unchanged,
// so the two tables after the recode should be checked for stray values.
recode educd ///
	(0 2 11 12 = 0) ///
	(14 = 1) ///
	(15 = 2) ///
	(16 = 3) ///
	(17 = 4) ///
	(22 = 5) ///
	(23 = 6) ///
	(25 = 7) ///
	(26 = 8) ///
	(30 = 9) ///
	(40 = 10) ///
	(50 = 11) ///
	(60/62 65 = 12) ///
	(70 = 13) ///
	(80 82 83 = 14) ///
	(90 = 15) ///
	(100 101 = 16) ///
	(110 = 17) ///
	(111/116 = 18) ///
	, gen(yschl)
// Verify the mapping: yschl by year, then yschl summarized within each educd code.
tab yschl year, m
tabstat yschl, by(educd) statistics(mean sd N) columns(statistics)

// Distribution of wage income by Census year: first for every record,
// then for records with strictly positive wage income.
tabstat incwage, by(year) statistics(mean sd min p10 q p90 max N) columns(statistics)
tabstat incwage if incwage>0, by(year) statistics(mean sd min p10 q p90 max N) columns(statistics)

// Sanity checks before restricting to earners.
// NOTE: Stata treats missing as larger than any number, so a missing
// wkswork2 also satisfies the first check.
assert wkswork2>0 // at least one week
assert !missing(incwage)
// Retain only records with positive wage earnings.
drop if incwage<=0

// Weeks worked: interval measure by year, then the continuous measure.
tab wkswork2 year, m
tabstat wkswork1, by(year) statistics(mean sd min p10 q p90 max N) columns(statistics)

// For each of the six wkswork2 intervals, compute the 1980 mean of wkswork1
// within that interval and assign it to the pre-1980 records in the same
// interval.
tempname mean80
foreach cat of numlist 1/6 {
	summarize wkswork1 if year==1980 & wkswork2==`cat'
	scalar `mean80' = r(mean)
	replace wkswork1 = `mean80' if year<1980 & wkswork2==`cat'
}
// Check the assignment and compare weeks worked across Census years,
// overall and excluding the top interval (wkswork2==6).
tab wkswork1 wkswork2 if year<1980, m
tabstat wkswork1, by(year) statistics(mean sd min p10 q p90 max N) columns(statistics)
tabstat wkswork1 if wkswork2<6, by(year) statistics(mean sd min p10 q p90 max N) columns(statistics)

// Weekly wage = annual wage income / weeks worked, bounded below and above.
// set minimum weekly earning to federal minimum wage * 20
// set maximum weekly earning to top-code earning value / 52
// minw is the year-specific floor (weekly dollars); maxw is first recoded in
// thousands of annual dollars and then converted to a weekly amount.
recode year (1960=20)(1970=26)(1980=58), gen(minw)
recode year (1960=25)(1970=50)(1980=75), gen(maxw)
replace maxw = maxw*1000/52
gen wkly_wage = incwage/wkswork1
// Distribution before applying the bounds.
tabstat wkly_wage, by(year) stat(mean sd min p1 p5 p10 q p90 p95 p99 max N) col(stat)
replace wkly_wage = minw if wkly_wage<minw
// Stata treats missing as larger than any number, so without the explicit
// guard a missing wkly_wage (e.g. from missing or zero wkswork1) would be
// silently set to the top-code value instead of staying missing.
replace wkly_wage = maxw if wkly_wage>maxw & !missing(wkly_wage)
// Distribution after applying the bounds.
tabstat wkly_wage, by(year) stat(mean sd min p1 p5 p10 q p90 p95 p99 max N) col(stat)

// One observation counter per record (summed into cell sizes at the collapse).
gen nobs = 1
// Relative sampling weights: 1 for 1980 (5% sample), 2.5 for 1970 (2% sample),
// 5 for 1960 (1% sample); missing for any other year.
gen wgt = cond(year==1980, 1, cond(year==1970, 2.5, cond(year==1960, 5, .)))

// Natural log of the bounded weekly wage.
gen logwage = ln(wkly_wage)

// Wage level and log wage summarized separately for each Census year.
foreach yr in 1960 1970 1980 {
	tabstat wkly_wage logwage if year==`yr', statistics(mean sd min p10 q p90 max N) columns(statistics)
}

// CSL (Acemoglu and Angrist, 2001)
// The merge keys are named birthpl and YOB, so the master variables are
// renamed for the merge and restored afterwards.
rename bpl birthpl
rename birthyr YOB
merge m:1 birthpl YOB using ../data/CSL.dta, keepusing(CA CL enroll_age drop_age req_sch)
// Discard law records with no matching Census record, then require that
// every remaining Census record found a match.
keep if _merge!=2
assert _merge==3
drop _merge
rename YOB birthyr
rename birthpl bpl

// Distribution of the child-labor (CL) and compulsory-attendance (CA)
// measures by Census year, with missing values shown.
tab CL year [fw=nobs], m
tab CA year [fw=nobs], m
// create CL category as in AA
// CL is not asserted non-missing, and Stata treats missing as larger than any
// number, so an unguarded (CL>=9) would code a missing CL as CL9==1.
// The dummies are therefore left missing whenever CL is missing.
gen CL6 = (CL<=6) if !missing(CL)
gen CL7 = (CL==7) if !missing(CL)
gen CL8 = (CL==8) if !missing(CL)
gen CL9 = (CL>=9) if !missing(CL)
// correct coding of CA (Stephens and Yang, 2014)
// All three inputs must be present, which also guarantees CCA is non-missing
// and makes the inequality-based dummies below safe.
assert !missing(drop_age)
assert !missing(enroll_age)
assert !missing(req_sch)
// CCA = dropout age minus enrollment age, capped at req_sch when req_sch>0.
gen CCA = drop_age-enroll_age
replace CCA = min(CCA,req_sch) if req_sch>0
gen CCA7 = (CCA<=7)
gen CCA8 = (CCA==8)
gen CCA9 = (CCA==9)
gen CCA10= (CCA>=10)
// Age implied by year of birth, measured relative to the year before the Census.
gen age_yob = (year-1)-birthyr

// Required Schooling Years (Stephens and Yang, 2014)
// The merge keys are named bplg and yob, so the master variables are renamed
// for the merge and restored afterwards.
rename bpl bplg
rename birthyr yob
merge m:1 bplg yob using ../data/schooling_laws_sy.dta
// Drop law records with no Census match; every Census record must match.
drop if _merge==2
assert _merge==3
rename bplg bpl
rename yob birthyr
rename numlawyears RS
drop _merge
// RS category dummies. Stata treats missing as larger than any number, so an
// unguarded (RS>=9) would code a missing RS as RS9==1; the dummies are
// therefore left missing whenever RS is missing.
gen RS6 = (RS<=6) if !missing(RS)
gen RS7 = (RS==7) if !missing(RS)
gen RS8 = (RS==8) if !missing(RS)
gen RS9 = (RS>=9) if !missing(RS)

// Descriptive statistics, one output table per Census year.
// Each pass posts the tabstat results and writes mean, sd and count to
// ../result/sumstat<year>.txt, overwriting any existing file.
local sumvars wkly_wage yschl age_yob CL6 CL7 CL8 CL9 CCA7 CCA8 CCA9 CCA10 RS6 RS7 RS8 RS9
foreach yr in 1960 1970 1980 {
	estpost tabstat `sumvars' [fw=nobs] if year==`yr', col(stat) stat(mean sd min q max N)
	esttab using ../result/sumstat`yr'.txt, cells("mean sd count") replace
}


// Aggregate to cells defined by Census year, birth year, birth quarter,
// birthplace and years of schooling: weights and observation counts are
// summed, wage and law variables are averaged within each cell.
collapse (sum) wgt nobs ///
	(mean) logwage wkly_wage CL CA CCA RS, ///
	by(year birthyr birthqtr bpl yschl)

// Shrink storage types where possible and write the aggregated file,
// overwriting any existing copy.
compress
save ../processed/Census_aggr.dta, replace
